# coding=utf-8
"""
Author  : Jane
Contact : xijian@ict.ac.cn
Time    : 2021/4/2 11:20
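Desc:
Label mapping as recorded by the author (original line first, English gloss below;
the second number in each pair is presumably the source category id - TODO confirm):
0：1军事演习，1：4军力部署，2：5区域冲突，3：6武器研发，4：7非军事
0: 1 military exercise, 1: 4 force deployment, 2: 5 regional conflict, 3: 6 weapons R&D, 4: 7 non-military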
"""
import os

import pandas as pd
from matplotlib import pyplot as plt

# Input: raw prediction dump, one tab-separated record per line with the label last.
filepath = 'predict_online_4_1_guanzhu.txt'
# Output of the (currently disabled) de-duplication step below.
write_filepath = 'predict_online_4_1_guanzhu_filter_duplicated.txt'
# Texts already seen; only referenced by the disabled de-duplication step.
text_set = set()

# The bare string below is the disabled one-off de-duplication pass: it is never
# executed. It is kept verbatim as a record of how `write_filepath` was produced.
"""
f_w = open(write_filepath, 'w', encoding='UTF-8')
df_data = pd.DataFrame(columns=['content', 'label'])
with open(filepath, 'r', encoding='UTF-8') as f:
    for line in f:
        # print(line)
        line = line.strip().split('\t')
        text = "".join(line[:-1])
        label = line[-1]
        if text not in text_set:
            text_set.add(text)
            f_w.write(text+"\t"+label+'\n')
            df_data.loc[df_data.shape[0]] = {'content':text, 'label':int(label)}
print(f'一共有{len(text_set)}个样本') # 25573
f_w.close()

print(df_data.head())
print(df_data['label'].value_counts())
df_data.label.hist(bins=30)
plt.show()
"""

# Load the de-duplicated predictions as a two-column frame. The column labels
# are supplied through `names`, so the first line of the file is read as data.
df_data = pd.read_csv(
    'predict_online_4_1_guanzhu_filter_duplicated.txt',
    sep='\t',
    names=['content', 'label'],
    index_col=False,
    encoding='UTF-8',
)
# Quick visual check of the first rows.
print(df_data.head())
# Row window taken from every label group for this batch (half-open, positional).
# The first batch (the "sample500" files) was drawn with 0, 100; this second
# batch uses the next 100 rows of each group so the two batches do not overlap.
batch_start, batch_stop = 100, 200

# One sub-frame per label id 0-4 (see the label mapping in the module docstring).
df_sample0 = df_data[df_data.label == 0]
print(df_sample0.iloc[:100, :].shape)
df_sample1 = df_data[df_data.label == 1]
df_sample2 = df_data[df_data.label == 2]
df_sample3 = df_data[df_data.label == 3]
df_sample4 = df_data[df_data.label == 4]

# Stack the per-label windows into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0, so a single `pd.concat` is used;
# a group with fewer than `batch_stop` rows simply contributes a shorter slice.
df_samples = pd.concat(
    [
        group.iloc[batch_start:batch_stop, :]
        for group in (df_sample0, df_sample1, df_sample2, df_sample3, df_sample4)
    ],
    ignore_index=True,
)

# Inspect the assembled batch before and after shuffling.
print(df_samples.shape)
print(df_samples.head())
# sample(frac=1.0) returns every row in random order; no random_state is set,
# so the row order differs from run to run.
df_samples = df_samples.sample(frac=1.0)  # shuffle
print(df_samples.shape)
print(df_samples.head())

# Output files for this (second) batch. The first batch was written to the same
# names without the "_2" part: ..._sample500.txt and ..._sample500_content.txt.
sample_path = 'split/predict_online_4_1_guanzhu_filter_duplicated_sample500_2.txt'
content_path = 'split/predict_online_4_1_guanzhu_filter_duplicated_sample500_2_content.txt'

# to_csv does not create missing directories, so make sure "split/" exists.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)

# Full sample: content and label columns, tab-separated, with a header row.
df_samples.to_csv(sample_path, sep='\t',
                  index=False, encoding='UTF-8')
# Content-only copy of the same rows in the same (shuffled) order.
df_samples['content'].to_csv(content_path, sep='\t',
                  index=False, encoding='UTF-8')
